Purpose

Here we compare the count results of CellRanger versus scUTRquant using the 10X Chromium 3’-end demonstration data (v2 and v3). Specifically, we check for any systematic differences in genes.

Libraries

library(SingleCellExperiment)
library(DropletUtils)
library(GenomicRanges)
library(plyranges)
library(rtracklayer)
library(tidyverse)
library(ggbeeswarm)
library(magrittr)
library(cowplot)
library(Matrix)

Data

SAMPLE_SHEET="metadata/counts_sample_sheet.csv"
GTF="data/gtf/adult.utrome.e3.t200.f0.999.w500.gtf.gz"

df_txs <- import(GTF) %>%
  filter(type == 'exon') %>%
  mcols() %>% as_tibble() %>%
  group_by(gene_id) %>%
  summarize(n_exons=length(unique(exon_id)), 
            n_txs=length(unique(transcript_id))) %>%
  mutate(n_exons=cut(n_exons, breaks=c(0,1,2,4,9,Inf),
                     labels=c("1", "2", "3-4", "5-9", "10+")),
         n_txs=cut(n_txs, breaks=c(0,1,2,3,4,Inf),
                     labels=c("1", "2", "3", "4", "5+")))

gr_genes <- import(GTF, genome="mm10") %>%
  filter(type == 'gene') %>%
  mutate(gene_id=str_extract(gene_id, "^ENSMUSG[0-9]+")) %>%
  `names<-`(.$gene_id)
mcols(gr_genes) <- NULL

df_genes <- readRDS("data/utrs/utrome_genes_annotation.Rds") %>%
  merge(df_txs, by='gene_id', all.x=TRUE, all.y=FALSE) %>%
  `rownames<-`(str_extract(.$gene_id, "^ENSMUSG[0-9]+"))

## conform 10X cell ids to match scUTRquant cell_id
conform_cell_ids <- function (sample_id, sce) {
  colData(sce) %<>%
    as_tibble %>%
    mutate(bx=str_extract(Barcode, "^[ACGT]{16}"),
           cell_id=str_c(sample_id, bx, sep='_')) %>%
    select(cell_id, bx) %>%
    set_rownames(.$cell_id) %>%
    DataFrame()
  sce
}

## summarize scUTRquant transcript counts to gene counts
txs_to_genes <- function (sce) {
  M_genes_txs <- rowData(sce)$gene_id %>%
    fac2sparse %>%
    set_rownames(str_extract(rownames(.), "^ENSMUSG[0-9]+"))
  sce_g <- SingleCellExperiment(assays=list(counts=M_genes_txs %*% counts(sce)),
                                colData=colData(sce),
                                rowRanges=gr_genes[rownames(M_genes_txs)])
  rowData(sce_g) <- df_genes[rownames(sce_g),]
  sce_g
}
  
df_counts <- read_csv(SAMPLE_SHEET) %>%
  ## read SCE files
  mutate(sce_sq=map(file_scutrquant, readRDS),
         sce_10x=map(file_cellranger, read10xCounts)) %>%
  
  ## adjust cell_ids
  mutate(sce_10x=map2(sample_id, sce_10x, conform_cell_ids)) %>%
  
  ## summarize to gene counts
  mutate(sce_sq=map(sce_sq, txs_to_genes))

Plotting Methods

plot_umis_per_gene_compare_dot <- function (sce_x, sce_y, group=NULL,
                                            label_x="CellRanger UMI Counts Per Gene",
                                            label_y="scUTRquant UMI Counts Per Gene") {
  group <- enquo(group)
  idx_genes <- intersect(rownames(sce_x), rownames(sce_y))
  idx_cells <- intersect(colnames(sce_x), colnames(sce_y))
  df <- tibble(x=rowSums(counts(sce_x[idx_genes,idx_cells])),
               y=rowSums(counts(sce_y[idx_genes,idx_cells])),
               gene_id=idx_genes) %>%
    left_join(as_tibble(rowData(sce_y[idx_genes,]), rownames="gene_id2"), by=c("gene_id"="gene_id2"))
  ggplot(df, aes(x=x+1, y=y+1)) +
    geom_point(alpha=0.3, size=0.1, pch=16) +
    geom_abline(slope=1, intercept=0, linetype='dashed') +
    facet_wrap(vars(!!group)) +
    scale_x_log10() + scale_y_log10() +
    labs(x=label_x, y=label_y) +
    theme_minimal_grid()
}

plot_umis_per_gene_compare_density <- function (sce_x, sce_y, group=NULL,
                                                include_zeros=FALSE,
                                                label_x="CellRanger",
                                                label_y="scUTRquant") {
  group <- enquo(group)
  idx_genes <- intersect(rownames(sce_x), rownames(sce_y))
  idx_cells <- intersect(colnames(sce_x), colnames(sce_y))
  df <- tibble(x=rowSums(counts(sce_x[idx_genes,idx_cells])),
               y=rowSums(counts(sce_y[idx_genes,idx_cells])),
               gene_id=idx_genes) %>%
    filter(include_zeros | (x > 0 | y > 0)) %>%
    mutate(ratio=(1+y)/(1+x)) %>%
    left_join(as_tibble(rowData(sce_y[idx_genes,]), rownames="gene_id2"), by=c("gene_id"="gene_id2"))
  ggplot(df, aes(x=ratio)) +  
    geom_vline(xintercept=1, linetype='dashed') +
    geom_hline(yintercept=0, color='grey') +
    geom_density() +
    scale_x_log10(limits=c(1/100, 100)) +
    facet_wrap(vars(!!group), ncol=1) +
    labs(x=sprintf("%s/%s UMI Counts Per Gene", label_y, label_x), y="Density") +
    theme_minimal_vgrid()
}

plot_umis_per_gene_compare_violin <- function (sce_x, sce_y, group=NULL,
                                               include_zeros=FALSE,
                                               min_counts=0,
                                               label_x="CellRanger",
                                               label_y="scUTRquant") {
  group <- enquo(group)
  idx_genes <- intersect(rownames(sce_x), rownames(sce_y))
  idx_cells <- intersect(colnames(sce_x), colnames(sce_y))
  df_rd <- rowRanges(sce_y[idx_genes,]) %>% 
    as.data.frame(row.names=names(.)) %>%
    as_tibble(rownames='gene_id2')
  df <- tibble(x=rowSums(counts(sce_x[idx_genes,idx_cells])),
               y=rowSums(counts(sce_y[idx_genes,idx_cells])),
               gene_id=idx_genes) %>%
    filter(include_zeros | (x > min_counts | y > min_counts)) %>%
    mutate(ratio=(1+y)/(1+x)) %>%
    left_join(df_rd, by=c("gene_id"="gene_id2"))
  ggplot(df, aes(x=!!group, y=ratio)) +  
    geom_hline(yintercept=1, linetype='dashed') +
    geom_violin(draw_quantiles=c(0.25,0.5,0.75)) +
    scale_y_log10() +
    coord_cartesian(ylim=c(1/10, 10)) +
    labs(y=sprintf("%s/%s UMI Counts Per Gene", label_y, label_x)) +
    theme_bw()
}

UTR Type

Dots

df_counts %>%
  transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_dot, 
                              group=atlas.utr_type)) %>%
  deframe() %>% { 
    for (id in names(.)) { 
      ## print locally in this document
      print(.[[id]] + ggtitle(id))
    }
  }

Violin

df_counts %>%
  transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin, 
                              group=atlas.utr_type, include_zeros=FALSE)) %>%
  deframe() %>% { 
    for (id in names(.)) { 
      ## print locally in this document
      print(.[[id]] + ggtitle(id))
    }
  }

Number of Transcripts

Violin

df_counts %>%
  transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin, 
                              group=n_txs, include_zeros=FALSE)) %>%
  deframe() %>% { 
    for (id in names(.)) { 
      ## print locally in this document
      print(.[[id]] + ggtitle(id))
    }
  }

Number of Exons

Violin

df_counts %>%
  transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin, 
                              group=n_exons, include_zeros=FALSE)) %>%
  deframe() %>% { 
    for (id in names(.)) { 
      ## print locally in this document
      print(.[[id]] + ggtitle(id))
    }
  }

Chromosome

Violin

df_counts %>%
  transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin, 
                              group=seqnames, include_zeros=FALSE)) %>%
  deframe() %>% { 
    for (id in names(.)) { 
      ## print locally in this document
      print(.[[id]] + ggtitle(id))
    }
  }

Strand

Violin

df_counts %>%
  transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin, 
                              group=strand, include_zeros=FALSE)) %>%
  deframe() %>% { 
    for (id in names(.)) { 
      ## print locally in this document
      print(.[[id]] + ggtitle(id))
    }
  }

Has IPA

Violin

df_counts %>%
  transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin, 
                              group=has_ipa, include_zeros=FALSE)) %>%
  deframe() %>% { 
    for (id in names(.)) { 
      ## print locally in this document
      print(.[[id]] + ggtitle(id))
    }
  }

Number of Atlas Cell Types

Violin

df_counts %>%
  transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin, 
                              group=cut_number(atlas.ncelltypes_gene, n=6), 
                              include_zeros=FALSE)) %>%
  deframe() %>% { 
    for (id in names(.)) { 
      ## print locally in this document
      print(.[[id]] + ggtitle(id))
    }
  }

Min. 50 UMIs

df_counts %>%
  transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin, 
                              group=cut_number(atlas.ncelltypes_gene, n=8), 
                              include_zeros=FALSE, min_counts=50)) %>%
  deframe() %>% { 
    for (id in names(.)) { 
      ## print locally in this document
      print(.[[id]] + ggtitle(id))
    }
  }


Session Info

## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS/LAPACK: /Users/mfansler/miniconda3/envs/bioc_3_12/lib/libopenblasp-r0.3.12.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] BSgenome.Mmusculus.UCSC.mm10_1.4.0 BSgenome_1.58.0                   
##  [3] Biostrings_2.58.0                  XVector_0.30.0                    
##  [5] Matrix_1.3-2                       cowplot_1.1.0                     
##  [7] magrittr_2.0.1                     ggbeeswarm_0.6.0                  
##  [9] forcats_0.5.1                      stringr_1.4.0                     
## [11] dplyr_1.0.5                        purrr_0.3.4                       
## [13] readr_1.4.0                        tidyr_1.1.3                       
## [15] tibble_3.1.0                       ggplot2_3.3.3                     
## [17] tidyverse_1.3.0                    rtracklayer_1.50.0                
## [19] plyranges_1.10.0                   DropletUtils_1.10.0               
## [21] SingleCellExperiment_1.12.0        SummarizedExperiment_1.20.0       
## [23] Biobase_2.50.0                     GenomicRanges_1.42.0              
## [25] GenomeInfoDb_1.26.0                IRanges_2.24.0                    
## [27] S4Vectors_0.28.0                   BiocGenerics_0.36.0               
## [29] MatrixGenerics_1.2.0               matrixStats_0.58.0                
## 
## loaded via a namespace (and not attached):
##  [1] fs_1.5.0                  bitops_1.0-6             
##  [3] lubridate_1.7.10          httr_1.4.2               
##  [5] tools_4.0.2               backports_1.2.1          
##  [7] utf8_1.1.4                R6_2.5.0                 
##  [9] vipor_0.4.5               HDF5Array_1.18.0         
## [11] DBI_1.1.1                 colorspace_2.0-0         
## [13] rhdf5filters_1.2.0        withr_2.4.1              
## [15] tidyselect_1.1.0          compiler_4.0.2           
## [17] cli_2.3.1                 rvest_1.0.0              
## [19] xml2_1.3.2                DelayedArray_0.16.0      
## [21] scales_1.1.1              digest_0.6.27            
## [23] Rsamtools_2.6.0           rmarkdown_2.7            
## [25] R.utils_2.10.1            pkgconfig_2.0.3          
## [27] htmltools_0.5.1.1         sparseMatrixStats_1.2.0  
## [29] highr_0.8                 dbplyr_2.1.0             
## [31] limma_3.46.0              rlang_0.4.10             
## [33] readxl_1.3.1              rstudioapi_0.13          
## [35] DelayedMatrixStats_1.12.0 farver_2.1.0             
## [37] generics_0.1.0            jsonlite_1.7.2           
## [39] BiocParallel_1.24.0       R.oo_1.24.0              
## [41] RCurl_1.98-1.2            GenomeInfoDbData_1.2.4   
## [43] scuttle_1.0.0             Rcpp_1.0.6               
## [45] munsell_0.5.0             Rhdf5lib_1.12.0          
## [47] fansi_0.4.2               lifecycle_1.0.0          
## [49] R.methodsS3_1.8.1         stringi_1.5.3            
## [51] yaml_2.2.1                edgeR_3.32.0             
## [53] zlibbioc_1.36.0           rhdf5_2.34.0             
## [55] grid_4.0.2                dqrng_0.2.1              
## [57] crayon_1.4.1              lattice_0.20-41          
## [59] haven_2.3.1               beachmat_2.6.4           
## [61] hms_1.0.0                 locfit_1.5-9.4           
## [63] knitr_1.31                pillar_1.5.1             
## [65] reprex_1.0.0              XML_3.99-0.5             
## [67] glue_1.4.2                evaluate_0.14            
## [69] modelr_0.1.8              vctrs_0.3.6              
## [71] cellranger_1.1.0          gtable_0.3.0             
## [73] assertthat_0.2.1          xfun_0.20                
## [75] broom_0.7.5               beeswarm_0.3.1           
## [77] GenomicAlignments_1.26.0  ellipsis_0.3.1